Getting names data

This data comes from kaggle.com, which has collected all recorded names for children born in the United States between the years 1880-2014.

## Load the national baby-name records.
## The raw CSV was originally imported with:
##   names <- read.csv('Baby-Name-Project/data/raw_data/NationalNames.csv')
## then saved as an RDS file, and the CSV was removed to save space.
names <- readRDS(file = "Baby-Name-Project/data/raw_data/all-names.rds")

## Load the baby-name records broken down by state.
## The raw CSV was originally imported with:
##   state <- read.csv('Baby-Name-Project/data/raw_data/StateNames.csv')
## then saved as an RDS file, and the CSV was removed.
state <- readRDS(file = "Baby-Name-Project/data/raw_data/state-names.rds")

Top 10 boys names

Exploratory analysis of the top 10 boys names

## total count per boys' name across the whole data set, most common first
## (summarise() only needs the grouping column and Count, so no select() step)
descending <- names %>%
  filter(Gender == 'M') %>%
  group_by(Name) %>%
  summarise(total = sum(Count)) %>%
  arrange(desc(total))

## keep the ten most common names; head() returns fewer rows rather than
## padding with NA rows when fewer than ten names are available
top10 <- head(descending, 10)

## render as an HTML table. Both styling options must be passed together as
## bootstrap_options: a second positional argument to kable_styling() is
## matched to latex_options, which has no effect on HTML output.
kable(top10, format = 'html', digits = 0,
      caption = 'Top 10 Names for Baby Boys, US 1880-2014') %>%
  kableExtra::kable_styling(bootstrap_options = c('striped', 'bordered')) %>%
  kableExtra::footnote(general = 'Kaggle.com', general_title = 'Source: ',
                       footnote_as_chunk = TRUE)
Top 10 Names for Baby Boys, US 1880-2014
Name total
James 5105919
John 5084943
Robert 4796695
Michael 4309198
William 4055473
David 3577704
Joseph 2570095
Richard 2555330
Charles 2364332
Thomas 2283080
Source: Kaggle.com

Top 10 girls names

Exploratory analysis of the top 10 girls names.

## total count per girls' name across the whole data set, most common first
## (summarise() only needs the grouping column and Count, so no select() step)
descending <- names %>%
  filter(Gender == 'F') %>%
  group_by(Name) %>%
  summarise(total = sum(Count)) %>%
  arrange(desc(total))

## keep the ten most common names; head() returns fewer rows rather than
## padding with NA rows when fewer than ten names are available
top10 <- head(descending, 10)

## render as an HTML table. Both styling options must be passed together as
## bootstrap_options: a second positional argument to kable_styling() is
## matched to latex_options, which has no effect on HTML output.
kable(top10, format = 'html', digits = 0,
      caption = 'Top 10 Names for Baby Girls, US 1880-2014') %>%
  kableExtra::kable_styling(bootstrap_options = c('striped', 'bordered')) %>%
  kableExtra::footnote(general = 'Kaggle.com', general_title = 'Source: ',
                       footnote_as_chunk = TRUE)
Top 10 Names for Baby Girls, US 1880-2014
Name total
Mary 4115282
Elizabeth 1601128
Patricia 1570567
Jennifer 1462742
Linda 1450843
Barbara 1432944
Margaret 1240006
Susan 1120469
Dorothy 1105680
Sarah 1060643
Source: Kaggle.com

Limiting to boys named Nathan and Nathan-related names

My main interest in this data set was to see how popular my own name is, specifically in comparison to popular variations.

## keep only the male records whose name is 'Nathan' or one of its variants
## ('Nate', 'Nathanial', 'Nathaniel', 'Nathanael');
## filter() is provided by the dplyr package
dnn <- filter(
  names,
  Gender == "M",
  Name %in% c("Nathan", "Nate", "Nathanial", "Nathaniel", "Nathanael")
)

## Percentage of a state's recorded babies that were given one particular name.
##
## name_totals:  one row per state, columns `state` and `total`
##               (count of babies with the name of interest)
## state_totals: one row per state, columns `state` and `total`
##               (count of babies with any name)
##
## Returns a data frame with one row per row of `state_totals`, in the same
## order, with columns `state` and `prop` (a percentage, 0-100).
##
## The two tables are matched on `state` rather than by row position, so a
## state with no rows for the name gets 0 instead of shifting every later
## state's value onto the wrong row.
state_name_percentage <- function(name_totals, state_totals) {
  state_totals %>%
    left_join(name_totals, by = 'state', suffix = c('_all', '_name')) %>%
    ## as.numeric() so the NA replacement below has the same type as 0
    mutate(prop = coalesce(as.numeric(total_name), 0) / total_all * 100) %>%
    select(state, prop)
}

## keeping only the male 'Nathan' rows of the state-level data
state_dnn <- state %>%
  filter(Gender == 'M', Name == 'Nathan')

## summing total number of Nathans for each state
state_dnn_sum <- state_dnn %>%
  group_by(state = State) %>%
  summarize(total = sum(Count))

## getting the total count over all names (both genders) for each state;
## this is the denominator for the percentages below
sum_names_state <- state %>%
  group_by(state = State) %>%
  summarize(total = sum(Count))

## getting percentage of Nathans for each state
prop_N <- state_name_percentage(state_dnn_sum, sum_names_state)

## same steps for 'Nathaniel': filter, sum by state, convert to percentage
state_dnnat <- state %>%
  filter(Gender == 'M', Name == 'Nathaniel')

state_dnnat_sum <- state_dnnat %>%
  group_by(state = State) %>%
  summarize(total = sum(Count))

prop_Nat <- state_name_percentage(state_dnnat_sum, sum_names_state)

Interactive Plot

Basically the same graph as above, except that it has an interactive component.

## line-and-point chart of the yearly count for each Nathan-related name
p2 <- ggplot(dnn, aes(x = Year, y = Count, colour = Name)) +
  geom_line() +
  geom_point() +
  labs(y = "Number of babies") +
  theme(plot.margin = unit(rep(1, 4), "lines"))

## ggplotly() turns the static ggplot into an interactive widget
ggplotly(p2)
## hovering the mouse over an individual point should show its count and year
## if you don't want the graph to show up inside the R markdown file:
    ## click on the gear beside "Knit" at the top
    ## select 'Chunk Output in Console'

Mapping total number of Nathans by state

Making a heatplot for proportion of Nathans born in each state (shown as percentage relative to the counts of all other names). For example, say a state has a value of 0.4%. This means that 0.4% of all people born in that state since 1880 were named Nathan. That’s a frequency of 4 in 1000 people.

## US map with each state filled by the `prop` column of prop_N
## (blue = lowest percentage, red = highest)
p3 <- plot_usmap(data = prop_N, values = "prop") +
  scale_fill_gradient(low = "blue", high = "red", name = "Percentage") +
  labs(
    title = "Percentage of Babies Named Nathan By State",
    subtitle = "US babies born 1880-2014",
    caption = "Source: Kaggle.com"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 13),
    plot.caption = element_text(size = 9),
    legend.position = "right",
    legend.title = element_text(size = 11, face = "bold"),
    legend.text = element_text(size = 9)
  )

p3

The major takeaways from this graph: Nathan is most popular in Utah (interestingly) and least popular in New Jersey.

Mapping total Nathaniels by State

Same principle as the above graph but using Nathaniel instead of Nathan.

## US map with each state filled by the `prop` column of prop_Nat
## (blue = lowest percentage, red = highest)
p4 <- plot_usmap(data = prop_Nat, values = "prop") +
  scale_fill_gradient(low = "blue", high = "red", name = "Percentage") +
  labs(
    title = "Percentage of Babies Named Nathaniel By State",
    subtitle = "US babies born 1880-2014",
    caption = "Source: Kaggle.com"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 13),
    plot.caption = element_text(size = 9),
    legend.position = "right",
    legend.title = element_text(size = 11, face = "bold"),
    legend.text = element_text(size = 9)
  )

p4

The major takeaways from this graph: Nathaniel is most popular in South Carolina and least popular in North Dakota. However, if we take a look at the scale bar for this graph and compare it to the one for the Nathan-By-State graph, it’s clear that Nathan is the more popular name.

Note: Remember, this is for the entire dataset. There are likely to be local trends (e.g., decade-specific trends). See the Nathan-related names graph above for reference.

State preference for Nathan or Nathaniel

## one row per state holding the Nathan and the Nathaniel percentages
## side by side (rows of prop_N and prop_Nat are paired by position)
props <- data.frame(
  state = prop_N$state,
  Nathan = prop_N$prop,
  Nathaniel = prop_Nat$prop
)

## Decide, element by element, which of two names is more popular.
##
## nathan:    numeric vector of values for Nathan (one per state)
## nathaniel: numeric vector of values for Nathaniel, same length and order
##
## Returns a character vector of the same length as the inputs:
##   '1' where nathan is strictly greater, '2' otherwise (ties count as '2').
## An NA comparison yields NA; empty input yields character(0).
nathan_or_nathaniel <- function(nathan, nathaniel) {
  ## refuse silently recycled comparisons between mismatched vectors
  stopifnot(length(nathan) == length(nathaniel))
  ## vectorised comparison; as.character() keeps the return type stable
  ## because ifelse() returns logical(0) for empty input
  as.character(ifelse(nathan > nathaniel, '1', '2'))
}

## classify every state ('1' or '2') and store the codes as a new
## `non` column on the props data frame
non <- nathan_or_nathaniel(nathan = props$Nathan, nathaniel = props$Nathaniel)
props[["non"]] <- non

## Summarise which name is preferred by more states.
##
## non: character vector of per-state codes; '1' counts for Nathan and any
##      other non-missing value counts for Nathaniel. NA entries are ignored.
##
## Returns a single string naming the name preferred by more states and the
## number of states preferring it, or a tie message when the counts are equal
## (which includes empty input).
det_nvn <- function(non) {
  n_nathan <- sum(non == '1', na.rm = TRUE)
  n_nathaniel <- sum(non != '1', na.rm = TRUE)

  if (n_nathan > n_nathaniel) {
    paste0('Majority of states (including DC) prefer Nathan: ', n_nathan, ' states')
  } else if (n_nathaniel > n_nathan) {
    paste0('Majority of states (including DC) prefer Nathaniel: ', n_nathaniel, ' states')
  } else {
    ## equal counts are not a majority for either name
    paste0('States (including DC) are evenly split between Nathan and Nathaniel: ',
           n_nathan, ' states each')
  }
}

## one-line summary of which name wins in more states (used in the subtitle)
which_is_more <- det_nvn(props$non)

## US map with each state filled according to its code in the `non` column
p_non <- plot_usmap(data = props, values = "non") +
  scale_fill_manual(
    values = c("Peach Puff", "Tomato"),
    labels = c("Nathan", "Nathaniel"),
    name = "Variation"
  ) +
  labs(
    title = "Nathan vs Nathaniel By State",
    subtitle = paste0("US babies born 1880-2014\n\n", which_is_more),
    caption = "Source: Kaggle.com"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 13),
    plot.caption = element_text(size = 9),
    legend.position = "right",
    legend.title = element_text(face = "bold", size = 13),
    legend.text = element_text(size = 11)
  )
p_non

This figure provides a nice visualization of what we could already have guessed from the two heatmaps above. Nathan is the more popular variation for an overwhelming majority of states. The only two exceptions are South Carolina (0.17% v 0.12%) and Washington DC (0.14% v 0.11%).